Given various features, the aim is to build a predictive model to determine the income level for people in the US. The income levels are binned at below 50K and above 50K.
getwd()
## [1] "C:/Users/Ashish Mishra/Desktop/DS/DataScienceProjects/CensusIncome"
# NOTE(review): hard-coded absolute path -- this breaks on any other machine;
# prefer project-relative paths (e.g. here::here()) if this script is rerun elsewhere.
setwd("C:/Users/Ashish Mishra/Desktop/DS/DataScienceProjects/CensusIncome")
#importing data.table library for faster computation
library(data.table)
#loading training and test data set
# Treat empty strings, single spaces, "?" and literal "NA" as missing on read.
train <- fread("train.csv",na.strings = c(""," ","?","NA",NA))
##
Read 70.2% of 199523 rows
Read 199523 rows and 41 (of 41) columns from 0.098 GB file in 00:00:03
test <- fread("test.csv",na.strings = c(""," ","?","NA",NA))
dim(train)
## [1] 199523 41
str(train)
## Classes 'data.table' and 'data.frame': 199523 obs. of 41 variables:
## $ age : int 73 58 18 9 10 48 42 28 47 34 ...
## $ class_of_worker : chr "Not in universe" "Self-employed-not incorporated" "Not in universe" "Not in universe" ...
## $ industry_code : int 0 4 0 0 0 40 34 4 43 4 ...
## $ occupation_code : int 0 34 0 0 0 10 3 40 26 37 ...
## $ education : chr "High school graduate" "Some college but no degree" "10th grade" "Children" ...
## $ wage_per_hour : int 0 0 0 0 0 1200 0 0 876 0 ...
## $ enrolled_in_edu_inst_lastwk : chr "Not in universe" "Not in universe" "High school" "Not in universe" ...
## $ marital_status : chr "Widowed" "Divorced" "Never married" "Never married" ...
## $ major_industry_code : chr "Not in universe or children" "Construction" "Not in universe or children" "Not in universe or children" ...
## $ major_occupation_code : chr "Not in universe" "Precision production craft & repair" "Not in universe" "Not in universe" ...
## $ race : chr "White" "White" "Asian or Pacific Islander" "White" ...
## $ hispanic_origin : chr "All other" "All other" "All other" "All other" ...
## $ sex : chr "Female" "Male" "Female" "Female" ...
## $ member_of_labor_union : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ reason_for_unemployment : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ full_parttime_employment_stat : chr "Not in labor force" "Children or Armed Forces" "Not in labor force" "Children or Armed Forces" ...
## $ capital_gains : int 0 0 0 0 0 0 5178 0 0 0 ...
## $ capital_losses : int 0 0 0 0 0 0 0 0 0 0 ...
## $ dividend_from_Stocks : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tax_filer_status : chr "Nonfiler" "Head of household" "Nonfiler" "Nonfiler" ...
## $ region_of_previous_residence : chr "Not in universe" "South" "Not in universe" "Not in universe" ...
## $ state_of_previous_residence : chr "Not in universe" "Arkansas" "Not in universe" "Not in universe" ...
## $ d_household_family_stat : chr "Other Rel 18+ ever marr not in subfamily" "Householder" "Child 18+ never marr Not in a subfamily" "Child <18 never marr not in subfamily" ...
## $ d_household_summary : chr "Other relative of householder" "Householder" "Child 18 or older" "Child under 18 never married" ...
## $ migration_msa : chr NA "MSA to MSA" NA "Nonmover" ...
## $ migration_reg : chr NA "Same county" NA "Nonmover" ...
## $ migration_within_reg : chr NA "Same county" NA "Nonmover" ...
## $ live_1_year_ago : chr "Not in universe under 1 year old" "No" "Not in universe under 1 year old" "Yes" ...
## $ migration_sunbelt : chr NA "Yes" NA "Not in universe" ...
## $ num_person_Worked_employer : int 0 1 0 0 0 1 6 4 5 6 ...
## $ family_members_under_18 : chr "Not in universe" "Not in universe" "Not in universe" "Both parents present" ...
## $ country_father : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ country_mother : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ country_self : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ citizenship : chr "Native- Born in the United States" "Native- Born in the United States" "Foreign born- Not a citizen of U S" "Native- Born in the United States" ...
## $ business_or_self_employed : int 0 0 0 0 0 2 0 0 0 0 ...
## $ fill_questionnaire_veteran_admin: chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ veterans_benefits : int 2 2 2 0 0 2 2 2 2 2 ...
## $ weeks_worked_in_year : int 0 52 0 0 0 52 52 30 52 52 ...
## $ year : int 95 94 95 94 94 95 94 95 95 94 ...
## $ income_level : chr "-50000" "-50000" "-50000" "-50000" ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(train)
## age class_of_worker industry_code occupation_code
## Min. : 0.00 Length:199523 Min. : 0.00 Min. : 0.00
## 1st Qu.:15.00 Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Median :33.00 Mode :character Median : 0.00 Median : 0.00
## Mean :34.49 Mean :15.35 Mean :11.31
## 3rd Qu.:50.00 3rd Qu.:33.00 3rd Qu.:26.00
## Max. :90.00 Max. :51.00 Max. :46.00
## education wage_per_hour enrolled_in_edu_inst_lastwk
## Length:199523 Min. : 0.00 Length:199523
## Class :character 1st Qu.: 0.00 Class :character
## Mode :character Median : 0.00 Mode :character
## Mean : 55.43
## 3rd Qu.: 0.00
## Max. :9999.00
## marital_status major_industry_code major_occupation_code
## Length:199523 Length:199523 Length:199523
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## race hispanic_origin sex
## Length:199523 Length:199523 Length:199523
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## member_of_labor_union reason_for_unemployment
## Length:199523 Length:199523
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## full_parttime_employment_stat capital_gains capital_losses
## Length:199523 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 0.0 1st Qu.: 0.00
## Mode :character Median : 0.0 Median : 0.00
## Mean : 434.7 Mean : 37.31
## 3rd Qu.: 0.0 3rd Qu.: 0.00
## Max. :99999.0 Max. :4608.00
## dividend_from_Stocks tax_filer_status region_of_previous_residence
## Min. : 0.0 Length:199523 Length:199523
## 1st Qu.: 0.0 Class :character Class :character
## Median : 0.0 Mode :character Mode :character
## Mean : 197.5
## 3rd Qu.: 0.0
## Max. :99999.0
## state_of_previous_residence d_household_family_stat d_household_summary
## Length:199523 Length:199523 Length:199523
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## migration_msa migration_reg migration_within_reg
## Length:199523 Length:199523 Length:199523
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## live_1_year_ago migration_sunbelt num_person_Worked_employer
## Length:199523 Length:199523 Min. :0.000
## Class :character Class :character 1st Qu.:0.000
## Mode :character Mode :character Median :1.000
## Mean :1.956
## 3rd Qu.:4.000
## Max. :6.000
## family_members_under_18 country_father country_mother
## Length:199523 Length:199523 Length:199523
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## country_self citizenship business_or_self_employed
## Length:199523 Length:199523 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.1754
## 3rd Qu.:0.0000
## Max. :2.0000
## fill_questionnaire_veteran_admin veterans_benefits weeks_worked_in_year
## Length:199523 Min. :0.000 Min. : 0.00
## Class :character 1st Qu.:2.000 1st Qu.: 0.00
## Mode :character Median :2.000 Median : 8.00
## Mean :1.515 Mean :23.17
## 3rd Qu.:2.000 3rd Qu.:52.00
## Max. :2.000 Max. :52.00
## year income_level
## Min. :94.0 Length:199523
## 1st Qu.:94.0 Class :character
## Median :94.0 Mode :character
## Mean :94.5
## 3rd Qu.:95.0
## Max. :95.0
head(train)
## age class_of_worker industry_code occupation_code
## 1: 73 Not in universe 0 0
## 2: 58 Self-employed-not incorporated 4 34
## 3: 18 Not in universe 0 0
## 4: 9 Not in universe 0 0
## 5: 10 Not in universe 0 0
## 6: 48 Private 40 10
## education wage_per_hour enrolled_in_edu_inst_lastwk
## 1: High school graduate 0 Not in universe
## 2: Some college but no degree 0 Not in universe
## 3: 10th grade 0 High school
## 4: Children 0 Not in universe
## 5: Children 0 Not in universe
## 6: Some college but no degree 1200 Not in universe
## marital_status major_industry_code
## 1: Widowed Not in universe or children
## 2: Divorced Construction
## 3: Never married Not in universe or children
## 4: Never married Not in universe or children
## 5: Never married Not in universe or children
## 6: Married-civilian spouse present Entertainment
## major_occupation_code race
## 1: Not in universe White
## 2: Precision production craft & repair White
## 3: Not in universe Asian or Pacific Islander
## 4: Not in universe White
## 5: Not in universe White
## 6: Professional specialty Amer Indian Aleut or Eskimo
## hispanic_origin sex member_of_labor_union reason_for_unemployment
## 1: All other Female Not in universe Not in universe
## 2: All other Male Not in universe Not in universe
## 3: All other Female Not in universe Not in universe
## 4: All other Female Not in universe Not in universe
## 5: All other Female Not in universe Not in universe
## 6: All other Female No Not in universe
## full_parttime_employment_stat capital_gains capital_losses
## 1: Not in labor force 0 0
## 2: Children or Armed Forces 0 0
## 3: Not in labor force 0 0
## 4: Children or Armed Forces 0 0
## 5: Children or Armed Forces 0 0
## 6: Full-time schedules 0 0
## dividend_from_Stocks tax_filer_status region_of_previous_residence
## 1: 0 Nonfiler Not in universe
## 2: 0 Head of household South
## 3: 0 Nonfiler Not in universe
## 4: 0 Nonfiler Not in universe
## 5: 0 Nonfiler Not in universe
## 6: 0 Joint both under 65 Not in universe
## state_of_previous_residence d_household_family_stat
## 1: Not in universe Other Rel 18+ ever marr not in subfamily
## 2: Arkansas Householder
## 3: Not in universe Child 18+ never marr Not in a subfamily
## 4: Not in universe Child <18 never marr not in subfamily
## 5: Not in universe Child <18 never marr not in subfamily
## 6: Not in universe Spouse of householder
## d_household_summary migration_msa migration_reg
## 1: Other relative of householder NA NA
## 2: Householder MSA to MSA Same county
## 3: Child 18 or older NA NA
## 4: Child under 18 never married Nonmover Nonmover
## 5: Child under 18 never married Nonmover Nonmover
## 6: Spouse of householder NA NA
## migration_within_reg live_1_year_ago migration_sunbelt
## 1: NA Not in universe under 1 year old NA
## 2: Same county No Yes
## 3: NA Not in universe under 1 year old NA
## 4: Nonmover Yes Not in universe
## 5: Nonmover Yes Not in universe
## 6: NA Not in universe under 1 year old NA
## num_person_Worked_employer family_members_under_18 country_father
## 1: 0 Not in universe United-States
## 2: 1 Not in universe United-States
## 3: 0 Not in universe Vietnam
## 4: 0 Both parents present United-States
## 5: 0 Both parents present United-States
## 6: 1 Not in universe Philippines
## country_mother country_self citizenship
## 1: United-States United-States Native- Born in the United States
## 2: United-States United-States Native- Born in the United States
## 3: Vietnam Vietnam Foreign born- Not a citizen of U S
## 4: United-States United-States Native- Born in the United States
## 5: United-States United-States Native- Born in the United States
## 6: United-States United-States Native- Born in the United States
## business_or_self_employed fill_questionnaire_veteran_admin
## 1: 0 Not in universe
## 2: 0 Not in universe
## 3: 0 Not in universe
## 4: 0 Not in universe
## 5: 0 Not in universe
## 6: 2 Not in universe
## veterans_benefits weeks_worked_in_year year income_level
## 1: 2 0 95 -50000
## 2: 2 52 94 -50000
## 3: 2 0 95 -50000
## 4: 0 0 94 -50000
## 5: 0 0 94 -50000
## 6: 2 52 95 -50000
head(test)
## age class_of_worker industry_code occupation_code
## 1: 38 Private 6 36
## 2: 44 Self-employed-not incorporated 37 12
## 3: 2 Not in universe 0 0
## 4: 35 Private 29 3
## 5: 49 Private 4 34
## 6: 13 Not in universe 0 0
## education wage_per_hour
## 1: 1st 2nd 3rd or 4th grade 0
## 2: Associates degree-occup /vocational 0
## 3: Children 0
## 4: High school graduate 0
## 5: High school graduate 0
## 6: Children 0
## enrolled_in_edu_inst_lastwk marital_status
## 1: Not in universe Married-civilian spouse present
## 2: Not in universe Married-civilian spouse present
## 3: Not in universe Never married
## 4: Not in universe Divorced
## 5: Not in universe Divorced
## 6: Not in universe Never married
## major_industry_code major_occupation_code
## 1: Manufacturing-durable goods Machine operators assmblrs & inspctrs
## 2: Business and repair services Professional specialty
## 3: Not in universe or children Not in universe
## 4: Transportation Executive admin and managerial
## 5: Construction Precision production craft & repair
## 6: Not in universe or children Not in universe
## race hispanic_origin sex member_of_labor_union
## 1: White Mexican (Mexicano) Female Not in universe
## 2: White All other Female Not in universe
## 3: White Mexican-American Male Not in universe
## 4: White All other Female Not in universe
## 5: White All other Male Not in universe
## 6: White All other Male Not in universe
## reason_for_unemployment full_parttime_employment_stat capital_gains
## 1: Not in universe Full-time schedules 0
## 2: Not in universe PT for econ reasons usually PT 0
## 3: Not in universe Children or Armed Forces 0
## 4: Not in universe Children or Armed Forces 0
## 5: Not in universe Full-time schedules 0
## 6: Not in universe Children or Armed Forces 0
## capital_losses dividend_from_Stocks tax_filer_status
## 1: 0 0 Joint one under 65 & one 65+
## 2: 0 2500 Joint both under 65
## 3: 0 0 Nonfiler
## 4: 0 0 Head of household
## 5: 0 0 Single
## 6: 0 0 Nonfiler
## region_of_previous_residence state_of_previous_residence
## 1: Not in universe Not in universe
## 2: Not in universe Not in universe
## 3: Not in universe Not in universe
## 4: Not in universe Not in universe
## 5: Not in universe Not in universe
## 6: Not in universe Not in universe
## d_household_family_stat d_household_summary
## 1: Spouse of householder Spouse of householder
## 2: Spouse of householder Spouse of householder
## 3: Child <18 never marr not in subfamily Child under 18 never married
## 4: Householder Householder
## 5: Secondary individual Nonrelative of householder
## 6: Child <18 never marr not in subfamily Child under 18 never married
## migration_msa migration_reg migration_within_reg
## 1: NA NA NA
## 2: NA NA NA
## 3: NA NA NA
## 4: Nonmover Nonmover Nonmover
## 5: NA NA NA
## 6: Nonmover Nonmover Nonmover
## live_1_year_ago migration_sunbelt
## 1: Not in universe under 1 year old NA
## 2: Not in universe under 1 year old NA
## 3: Not in universe under 1 year old NA
## 4: Yes Not in universe
## 5: Not in universe under 1 year old NA
## 6: Yes Not in universe
## num_person_Worked_employer family_members_under_18 country_father
## 1: 4 Not in universe Mexico
## 2: 1 Not in universe United-States
## 3: 0 Both parents present United-States
## 4: 5 Not in universe United-States
## 5: 4 Not in universe United-States
## 6: 0 Both parents present Germany
## country_mother country_self citizenship
## 1: Mexico Mexico Foreign born- Not a citizen of U S
## 2: United-States United-States Native- Born in the United States
## 3: United-States United-States Native- Born in the United States
## 4: United-States United-States Native- Born in the United States
## 5: United-States United-States Native- Born in the United States
## 6: United-States United-States Native- Born in the United States
## business_or_self_employed fill_questionnaire_veteran_admin
## 1: 0 Not in universe
## 2: 0 Not in universe
## 3: 0 Not in universe
## 4: 2 Not in universe
## 5: 0 Not in universe
## 6: 0 Not in universe
## veterans_benefits weeks_worked_in_year year income_level
## 1: 2 12 95 -50000
## 2: 2 26 95 -50000
## 3: 0 0 95 -50000
## 4: 2 52 94 -50000
## 5: 2 50 95 -50000
## 6: 0 0 94 -50000
unique(train$income_level)
## [1] "-50000" "+50000"
unique(test$income_level)
## [1] "-50000" "50000+."
We can see that the labels of the dependent variable are not the same for the train and test data sets. Since this is a binary classification problem, we can encode them as 0 and 1.
# Encode the target as 0/1: "-50000" -> 0, anything else -> 1.
# (This also normalizes the test set's differently-spelled "50000+." label to 1.)
train$income_level <- ifelse(train$income_level == "-50000",0,1)
test$income_level <- ifelse(test$income_level == "-50000",0,1)
# Class balance in percent: heavily imbalanced (~94/6).
round(prop.table(table(train$income_level))*100)
##
## 0 1
## 94 6
We see that the majority class has a proportion of 94%. In other words, a model that always predicted the majority class would already achieve 94% accuracy. In absolute figures that looks incredible, but our real performance depends on how well we can predict the minority class.
As we saw in str(), the column classes in our data set do not match the specification given on the dataset site http://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.names. Let's correct the column classes.
# Column classes per the UCI data dictionary: everything except the seven
# continuous columns (age, wage_per_hour, capital_gains, capital_losses,
# dividend_from_Stocks, num_person_Worked_employer, weeks_worked_in_year)
# is categorical. Column 41 is the target, income_level.
contin_cols <- c(1, 6, 17, 18, 19, 30, 39)
factcols <- setdiff(1:41, contin_cols)
numcols <- setdiff(1:40, factcols)
# Convert the selected columns in place; data.table's := modifies by
# reference, avoiding a copy of the ~200k-row tables.
train[,(factcols) := lapply(.SD, factor), .SDcols = factcols]
train[,(numcols) := lapply(.SD, as.numeric), .SDcols = numcols]
test[,(factcols) := lapply(.SD, factor), .SDcols = factcols]
test[,(numcols) := lapply(.SD, as.numeric), .SDcols = numcols]
Now, let’s separate categorical variables & numerical variables. This will help us in further analysis. subset categorical variables
# Categorical columns selected by position (with=FALSE makes factcols a
# column-index vector). Note income_level (column 41) rides along in cat_train.
cat_train <- train[,factcols, with=FALSE]
cat_test <- test[,factcols,with=FALSE]
subset numerical variables
# Numeric columns, also selected by position.
num_train <- train[,numcols,with=FALSE]
num_test <- test[,numcols,with=FALSE]
removing train and test dataset to save the memory
# Free the full tables; all later work uses the cat_/num_ subsets.
rm(train,test)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.1
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.1
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#library(devtools)
#dev_mode(on=T)
#install_github("hadley/ggplot2")
write a plot function
# Interactive histogram + density overlay for a numeric vector.
# `a`   : numeric vector to plot
# `data`: data set supplying the plotting context (default num_train keeps
#         the original behaviour while removing the hidden global dependency)
tr <- function(a, data = num_train){
  p <- ggplot(data = data, aes(x = a, y = ..density..)) +
    geom_histogram(fill = "blue", color = "red", alpha = 0.5, bins = 100) +
    geom_density()
  # Hand the plot to plotly explicitly; a bare ggplotly() call relies on
  # ggplot2::last_plot(), which is only set once a plot has been printed.
  ggplotly(p)
}
tr(num_train$age)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
As we can see, the data set consists of people aged from 0 to 90 with frequency of people declining with age. Now, if we think of the problem we are trying to solve, do you think population below age 20 could earn >50K under normal circumstances? I don’t think so. Therefore, we can bin this variable into age groups.
tr(num_train$capital_losses)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
add target variable
# Temporarily attach the target so scatter plots can colour by income_level.
num_train[,income_level := cat_train$income_level]
Creating Scatter plot
ggplot(data=num_train,aes(x = age, y=wage_per_hour))+geom_point(aes(colour=income_level))+scale_y_continuous("wage per hour", breaks = seq(0,10000,1000))
As we can see, most of the people having income_level 1, seem to fall in the age of 25-65 earning wage of $1000 to $4000 per hour. This plot further strengthens our assumption that age < 20 would have income_level 0, hence we will bin this variable.
ggplot(data=num_train,aes(x = age, y=dividend_from_Stocks))+geom_point(aes(colour=income_level))+scale_y_continuous("dividend from Stocks", breaks = seq(0,100000,10000))
This plot also giving similar kind of trend as previous one.
we can visualize our categorical variables as well. For categories, rather than a bland bar chart, a dodged bar chart provides more information. In dodged bar chart, we plot the categorical variables & dependent variable adjacent to each other Dodged bar chart
# Dodged bar chart of a categorical variable against income_level.
# `i`   : factor/character vector to plot on the x-axis
# `data`: data set providing income_level (default cat_train preserves the
#         original behaviour while removing the hard global dependency)
all_bar <- function(i, data = cat_train){
  ggplot(data, aes(x = i, fill = income_level)) +
    geom_bar(position = "dodge", color = "black") +
    scale_fill_brewer(palette = "Pastel1") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 10))
}
variable class_of_worker
all_bar(cat_train$class_of_worker)
No information is provided for the "Not in universe" category. This variable looks imbalanced, i.e. only two category levels seem to dominate. In such a situation, a good practice is to combine levels having less than 5% of the total category frequency.
variable education
all_bar(cat_train$education)
We can infer that Bachelor's degree holders form the largest proportion of people with income_level 1.
We can also check categories is by using 2 way tables. we can create proportionate tables to check the effect of dependent variable per categories as shown:
prop.table(table(cat_train$marital_status,cat_train$income_level),1)
##
## 0 1
## Divorced 0.91612903 0.08387097
## Married-A F spouse present 0.97744361 0.02255639
## Married-civilian spouse present 0.88601553 0.11398447
## Married-spouse absent 0.93675889 0.06324111
## Never married 0.98708447 0.01291553
## Separated 0.95433526 0.04566474
## Widowed 0.96846029 0.03153971
prop.table(table(cat_train$class_of_worker,cat_train$income_level),1)
##
## 0 1
## Federal government 0.795897436 0.204102564
## Local government 0.891187050 0.108812950
## Never worked 0.995444191 0.004555809
## Not in universe 0.990982094 0.009017906
## Private 0.898345088 0.101654912
## Self-employed-incorporated 0.652679939 0.347320061
## Self-employed-not incorporated 0.870929544 0.129070456
## State government 0.885261415 0.114738585
## Without pay 0.993939394 0.006060606
First lets check the missing values in numerical data
sum(is.na(num_train))
## [1] 0
sum(is.na(num_test))
## [1] 0
So, we do not have any missing value for numerical data.
Now we need to check correlation in variables
library(caret)
## Loading required package: lattice
#set threshold as 0.7
# Drop the target before computing pairwise correlations among predictors.
num_train$income_level <- NULL
# caret::findCorrelation returns indices of columns to remove so that no
# remaining pair has |r| above the cutoff.
ax = findCorrelation(x=cor(num_train), cutoff = 0.7)
num_train = num_train[,-ax,with = FALSE]
# NOTE(review): the test-side removal is hard-coded; it assumes findCorrelation
# flagged exactly weeks_worked_in_year -- confirm if the data ever changes.
num_test[,weeks_worked_in_year := NULL]
#The variable weeks_worked_in_year gets removed. we've removed that variable from test data too.
# Percentage of missing values per categorical column, train and test.
mvtr <- sapply(cat_train, function(x){sum(is.na(x))/length(x)})*100
# Fixed misplaced braces/paren in the test-side expression: the original
# computed sum(is.na(x)/length(x)) and applied *100 inside the anonymous
# function body. The numbers come out identical, but the form was
# inconsistent with mvtr and easy to misread.
mvte <- sapply(cat_test, function(x){sum(is.na(x))/length(x)})*100
mvtr
## class_of_worker industry_code
## 0.0000000 0.0000000
## occupation_code education
## 0.0000000 0.0000000
## enrolled_in_edu_inst_lastwk marital_status
## 0.0000000 0.0000000
## major_industry_code major_occupation_code
## 0.0000000 0.0000000
## race hispanic_origin
## 0.0000000 0.4380447
## sex member_of_labor_union
## 0.0000000 0.0000000
## reason_for_unemployment full_parttime_employment_stat
## 0.0000000 0.0000000
## tax_filer_status region_of_previous_residence
## 0.0000000 0.0000000
## state_of_previous_residence d_household_family_stat
## 0.3548463 0.0000000
## d_household_summary migration_msa
## 0.0000000 49.9671717
## migration_reg migration_within_reg
## 49.9671717 49.9671717
## live_1_year_ago migration_sunbelt
## 0.0000000 49.9671717
## family_members_under_18 country_father
## 0.0000000 3.3645244
## country_mother country_self
## 3.0668144 1.7005558
## citizenship business_or_self_employed
## 0.0000000 0.0000000
## fill_questionnaire_veteran_admin veterans_benefits
## 0.0000000 0.0000000
## year income_level
## 0.0000000 0.0000000
mvte
## class_of_worker industry_code
## 0.0000000 0.0000000
## occupation_code education
## 0.0000000 0.0000000
## enrolled_in_edu_inst_lastwk marital_status
## 0.0000000 0.0000000
## major_industry_code major_occupation_code
## 0.0000000 0.0000000
## race hispanic_origin
## 0.0000000 0.4059662
## sex member_of_labor_union
## 0.0000000 0.0000000
## reason_for_unemployment full_parttime_employment_stat
## 0.0000000 0.0000000
## tax_filer_status region_of_previous_residence
## 0.0000000 0.0000000
## state_of_previous_residence d_household_family_stat
## 0.3307873 0.0000000
## d_household_summary migration_msa
## 0.0000000 50.0651551
## migration_reg migration_within_reg
## 50.0651551 50.0651551
## live_1_year_ago migration_sunbelt
## 0.0000000 50.0651551
## family_members_under_18 country_father
## 0.0000000 3.4371805
## country_mother country_self
## 3.0793288 1.7682083
## citizenship business_or_self_employed
## 0.0000000 0.0000000
## fill_questionnaire_veteran_admin veterans_benefits
## 0.0000000 0.0000000
## year income_level
## 0.0000000 0.0000000
We find that some of the variables have ~50% missing values. A high proportion of missing values can be attributed to difficulty in data collection. For now, we'll remove these variables. A simple subset() call does the trick.
#select columns with missing value less than 5%
# Drops the four migration_* columns (~50% missing); everything else,
# including income_level (0% missing), is kept.
cat_train <- subset(cat_train, select = mvtr < 5 )
cat_test <- subset(cat_test, select = mvte < 5)
For the rest of missing values, a nicer approach would be to label them as ‘Unavailable’. Imputing missing values on large data sets can be painstaking. data.table’s set() function makes this computation insanely fast.
#set NA as Unavailable - train data
#convert to characters
# LHS of := is parenthesized so data.table clearly evaluates it as a vector
# of column names (now consistent with the test-data block below).
cat_train <- cat_train[, (names(cat_train)) := lapply(.SD, as.character), .SDcols = names(cat_train)]
# data.table::set() replaces NAs column-by-column by reference, without copying.
for (i in seq_along(cat_train)) set(cat_train, i=which(is.na(cat_train[[i]])), j=i, value="Unavailable")
#convert back to factors
cat_train <- cat_train[, (names(cat_train)) := lapply(.SD, factor), .SDcols = names(cat_train)]
#set NA as Unavailable - test data
# Same treatment as the train data: character -> fill NAs -> refactor.
cat_test <- cat_test[, (names(cat_test)) := lapply(.SD, as.character), .SDcols = names(cat_test)]
# data.table::set() replaces NAs column-by-column by reference, without copying.
for (i in seq_along(cat_test)) set(cat_test, i=which(is.na(cat_test[[i]])), j=i, value="Unavailable")
#convert back to factors
cat_test <- cat_test[, (names(cat_test)) := lapply(.SD, factor), .SDcols = names(cat_test)]
In previous analysis, we saw that categorical variables have several levels with low frequencies. Such levels don’t help as chances are they wouldn’t be available in test set. We’ll do this hygiene check anyways, in coming steps. To combine levels, a simple for loop does the trick. After combining, the new category level will named as ‘Other’.
#combine factor levels with less than 5% values
#train
p <- 5/100  # frequency threshold; loop-invariant, so computed once
# NOTE(review): this loop also scans income_level; its minority class is ~6%
# so it survives the 5% cutoff, but a lower threshold would lump the target.
for(i in names(cat_train)){
  # level names whose relative frequency falls below the threshold
  ld <- names(which(prop.table(table(cat_train[[i]])) < p))
  levels(cat_train[[i]])[levels(cat_train[[i]]) %in% ld] <- "Other"
}
#test
p <- 5/100  # frequency threshold; loop-invariant, so computed once
# NOTE(review): lumping is computed independently on train and test, so the
# resulting "Other" levels may not correspond across the two sets -- the
# nlevs check below is relied on to catch mismatches.
for(i in names(cat_test)){
  ld <- names(which(prop.table(table(cat_test[[i]])) < p))
  levels(cat_test[[i]])[levels(cat_test[[i]]) %in% ld] <- "Other"
}
The "nlevs" column gives the number of unique levels for each of the given variables.
#check columns with unequal levels
library(mlr)
## Warning: package 'mlr' was built under R version 3.4.1
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.4.1
##
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
##
## train
summarizeColumns(cat_train)[,"nlevs"]
## [1] 3 3 2 5 2 5 3 8 3 2 2 3 2 4 4 2 2 6 5 3 4 3 2 2 3 3 2 3 2 2
summarizeColumns(cat_test)[,"nlevs"]
## [1] 3 3 2 5 2 5 3 8 3 2 2 3 2 4 4 2 2 6 5 3 4 3 2 2 3 3 2 3 2 2
let’s look at numeric variables and reflect on possible ways for binning. Since a histogram wasn’t enough for us to make decision, let’s create simple tables representing counts of unique values in these variables as shown
num_train[,.N,age][order(age)]
## age N
## 1: 0 2839
## 2: 1 3138
## 3: 2 3236
## 4: 3 3279
## 5: 4 3318
## 6: 5 3332
## 7: 6 3171
## 8: 7 3218
## 9: 8 3187
## 10: 9 3162
## 11: 10 3134
## 12: 11 3128
## 13: 12 3060
## 14: 13 3152
## 15: 14 3068
## 16: 15 2926
## 17: 16 2882
## 18: 17 2762
## 19: 18 2484
## 20: 19 2419
## 21: 20 2390
## 22: 21 2386
## 23: 22 2573
## 24: 23 2789
## 25: 24 2783
## 26: 25 2783
## 27: 26 2714
## 28: 27 2758
## 29: 28 3013
## 30: 29 3050
## 31: 30 3203
## 32: 31 3351
## 33: 32 3188
## 34: 33 3340
## 35: 34 3489
## 36: 35 3450
## 37: 36 3353
## 38: 37 3278
## 39: 38 3277
## 40: 39 3144
## 41: 40 3114
## 42: 41 3134
## 43: 42 2995
## 44: 43 2889
## 45: 44 2786
## 46: 45 2847
## 47: 46 2816
## 48: 47 2795
## 49: 48 2410
## 50: 49 2142
## 51: 50 2214
## 52: 51 2215
## 53: 52 2115
## 54: 53 1900
## 55: 54 1745
## 56: 55 1730
## 57: 56 1710
## 58: 57 1622
## 59: 58 1600
## 60: 59 1580
## 61: 60 1560
## 62: 61 1497
## 63: 62 1531
## 64: 63 1501
## 65: 64 1579
## 66: 65 1550
## 67: 66 1443
## 68: 67 1496
## 69: 68 1436
## 70: 69 1412
## 71: 70 1410
## 72: 71 1418
## 73: 72 1315
## 74: 73 1354
## 75: 74 1227
## 76: 75 1065
## 77: 76 1050
## 78: 77 979
## 79: 78 876
## 80: 79 811
## 81: 80 799
## 82: 81 720
## 83: 82 615
## 84: 83 561
## 85: 84 519
## 86: 85 423
## 87: 86 348
## 88: 87 301
## 89: 88 241
## 90: 89 195
## 91: 90 725
## age N
num_train[,.N,wage_per_hour][order(-N)]
## wage_per_hour N
## 1: 0 188219
## 2: 500 734
## 3: 600 546
## 4: 700 534
## 5: 800 507
## ---
## 1236: 724 1
## 1237: 1376 1
## 1238: 3156 1
## 1239: 2188 1
## 1240: 1092 1
It is clear that more than 70-80% of the observations are 0 in these variables. Let's bin these variables accordingly. I used a decision tree to determine the ranges of the resultant bins.
#bin age variable 0-30 31-60 61-90
num_train[,age:= cut(x = age,breaks = c(0,30,60,90),include.lowest = TRUE,labels =c("young","adult","old"))]
num_train[,age := factor(age)]  # cut() already yields a factor; re-factoring also drops any empty bins
num_test[,age:= cut(x = age,breaks = c(0,30,60,90),include.lowest = TRUE,labels = c("young","adult","old"))]
num_test[,age := factor(age)]
#Bin numeric variables with Zero and MoreThanZero
# The identical Zero/MoreThanZero transformation applies to four columns in
# both data sets; loop over the column names instead of repeating the same
# line eight times.
zero_bin_cols <- c("wage_per_hour", "capital_gains", "capital_losses", "dividend_from_Stocks")
for (col in zero_bin_cols) {
  num_train[, (col) := as.factor(ifelse(get(col) == 0, "Zero", "MoreThanZero"))]
  num_test[, (col) := as.factor(ifelse(get(col) == 0, "Zero", "MoreThanZero"))]
}
#combine data and make test & train files
# cbind relies on num_* and cat_* still being in the original row order;
# neither has been reordered above, so rows line up.
d_train <- cbind(num_train,cat_train)
d_test <- cbind(num_test,cat_test)
#remove unwanted files
rm(num_train,num_test,cat_train,cat_test) #save memory
Create Task
train.task <- makeClassifTask(data =d_train,target = "income_level")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
test.task <- makeClassifTask(data=d_test,target = "income_level")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
# Drop zero-variance features: they carry no information and some learners
# error out on them.
train.task <- removeConstantFeatures(train.task)
test.task <- removeConstantFeatures(test.task)
library('FSelector')
## Warning: package 'FSelector' was built under R version 3.4.1
# Rank features by information gain with respect to the target.
var_imp <- generateFilterValuesData(train.task, method = c("information.gain"))
plotFilterValues(var_imp,feat.type.cols = TRUE)
The variable major_occupation_code would provide highest information to the model followed by other variables in descending order. This chart is deduced using a tree algorithm, where at every split, the information is calculated using reduction in entropy (homogeneity). Let’s keep this knowledge safe, we might use it in coming steps.
Now, we’ll try to make our data balanced using various techniques such as over sampling, undersampling and SMOTE. In SMOTE, the algorithm looks at n nearest neighbors, measures the distance between them and introduces a new observation at the center of n observations. While proceeding, we must keep in mind that these techniques have their own drawbacks such as:
Undersampling leads to loss of information; oversampling leads to overestimation of the minority class.
We will try all the three techniques.
#undersampling: randomly drop majority-class rows to rebalance
train.under <- undersample(train.task,rate = 0.1) #keep only 10% of majority class
#class counts after undersampling
table(getTaskTargets(train.under))
##
## 0 1
## 18714 12382
#oversampling: replicate minority-class rows
train.over <- oversample(train.task,rate=15) #make minority class 15 times
#class counts after oversampling
table(getTaskTargets(train.over))
##
## 0 1
## 187141 185730
#SMOTE: synthesize new minority observations from nearest neighbors
#train.smote <- smote(train.task,rate =5,nn = 3)
#Due to system limitation we are unable to use SMOTE technique.
#lets see which algorithms are available
#list all mlr learners that support binary classification
listLearners("classif","twoclass")[c("class","package")]
## Warning in listLearners.character("classif", "twoclass"): The following learners could not be constructed, probably because their packages are not installed:
## classif.ada,classif.bartMachine,classif.bdk,classif.blackboost,classif.boosting,classif.bst,classif.C50,classif.cforest,classif.clusterSVM,classif.ctree,classif.cvglmnet,classif.dbnDNN,classif.dcSVM,classif.earth,classif.evtree,classif.extraTrees,classif.fnn,classif.gamboost,classif.gaterSVM,classif.gausspr,classif.geoDA,classif.glmboost,classif.glmnet,classif.h2o.deeplearning,classif.h2o.gbm,classif.h2o.glm,classif.h2o.randomForest,classif.hdrda,classif.kknn,classif.ksvm,classif.LiblineaRL1L2SVC,classif.LiblineaRL1LogReg,classif.LiblineaRL2L1SVC,classif.LiblineaRL2LogReg,classif.LiblineaRL2SVC,classif.LiblineaRMultiClassSVC,classif.linDA,classif.lqa,classif.lssvm,classif.mda,classif.mlp,classif.neuralnet,classif.nnTrain,classif.nodeHarvest,classif.pamr,classif.penalized.fusedlasso,classif.penalized.lasso,classif.penalized.ridge,classif.plr,classif.quaDA,classif.randomForestSRC,classif.ranger,classif.rda,classif.rFerns,classif.rknn,classif.rotationForest,classif.RRF,classif.rrlda,classif.saeDNN,classif.sda,classif.sparseLDA,classif.xyf,cluster.cmeans,cluster.dbscan,cluster.kkmeans,cluster.kmeans,multilabel.cforest,multilabel.randomForestSRC,multilabel.rFerns,regr.bartMachine,regr.bcart,regr.bdk,regr.bgp,regr.bgpllm,regr.blackboost,regr.blm,regr.brnn,regr.bst,regr.btgp,regr.btgpllm,regr.btlm,regr.cforest,regr.crs,regr.ctree,regr.cubist,regr.cvglmnet,regr.earth,regr.elmNN,regr.evtree,regr.extraTrees,regr.fnn,regr.frbs,regr.gamboost,regr.gausspr,regr.glmboost,regr.glmnet,regr.GPfit,regr.h2o.deeplearning,regr.h2o.gbm,regr.h2o.glm,regr.h2o.randomForest,regr.kknn,regr.km,regr.ksvm,regr.laGP,regr.LiblineaRL2L1SVR,regr.LiblineaRL2L2SVR,regr.mars,regr.mob,regr.nodeHarvest,regr.pcr,regr.penalized.fusedlasso,regr.penalized.lasso,regr.penalized.ridge,regr.plsr,regr.randomForestSRC,regr.ranger,regr.rknn,regr.RRF,regr.rsm,regr.rvm,regr.slim,regr.xyf,surv.cforest,surv.CoxBoost,surv.cv.CoxBoost,surv.cvglmnet,surv.gamboost,surv.glmboost,surv.glmnet,surv.penalized.fusedlasso,surv.
penalized.lasso,surv.penalized.ridge,surv.randomForestSRC,surv.ranger
## Check ?learners to see which packages you need or install mlr with all suggestions.
## class package
## 1 classif.binomial stats
## 2 classif.featureless mlr
## 3 classif.gbm gbm
## 4 classif.IBk RWeka
## 5 classif.J48 RWeka
## 6 classif.JRip RWeka
## ... (22 rows, 2 cols)
We’ll start with naive Bayes, an algorithms based on bayes theorem. In case of high dimensional data like text-mining, naive Bayes tends to do wonders in accuracy. It works on categorical data. In case of numeric variables, a normal distribution is considered for these variables and a mean and standard deviation is calculated. Then, using some standard z-table calculations probabilities can be estimated for each of your continuous variables to make the naive Bayes classifier.
We’ll use naive Bayes on all 4 data sets (imbalanced, oversampled, undersampled and SMOTE — we currently don’t have the SMOTE set) and compare prediction accuracy using cross validation. Following are the metrics we’ll use to evaluate predictive accuracy: Sensitivity = True Positive Rate (TP/(TP+FN)) — out of all the positive (majority class) values, how many were predicted correctly. Specificity = True Negative Rate (TN/(TN+FP)) — out of all the negative (minority class) values, how many were predicted correctly. Precision = TP/(TP+FP). Recall = Sensitivity. F score = 2 * (Precision * Recall)/(Precision + Recall) — the harmonic mean of precision and recall, used to compare models side-by-side; higher is better.
#naive Bayes
naive_learner <- makeLearner("classif.naiveBayes", predict.type = "response")
#Laplace smoothing avoids zero probabilities for factor levels unseen in a fold
naive_learner$par.vals <- list(laplace = 1)
#10-fold stratified CV preserves the class-imbalance ratio in every fold
folds <- makeResampleDesc("CV", iters = 10, stratify = TRUE)
#cross-validation helper: returns the aggregated CV measures for a task.
#Generalized: learner and resampling description can now be overridden;
#the defaults reproduce the original behavior exactly.
fun_cv <- function(a, learner = naive_learner, rdesc = folds) {
  crv_val <- resample(learner, a, rdesc, measures = list(acc, tpr, tnr, fpr, fp, fn))
  crv_val$aggr
}
fun_cv(train.task)
## [Resample] cross-validation iter 1: acc.test.mean=0.733,tpr.test.mean=0.722,tnr.test.mean=0.898,fpr.test.mean=0.102,fp.test.mean= 126,fn.test.mean=5.21e+03
## [Resample] cross-validation iter 2: acc.test.mean=0.729,tpr.test.mean=0.719,tnr.test.mean=0.889,fpr.test.mean=0.111,fp.test.mean= 138,fn.test.mean=5.26e+03
## [Resample] cross-validation iter 3: acc.test.mean=0.731,tpr.test.mean=0.721,tnr.test.mean=0.889,fpr.test.mean=0.111,fp.test.mean= 138,fn.test.mean=5.23e+03
## [Resample] cross-validation iter 4: acc.test.mean=0.734,tpr.test.mean=0.723,tnr.test.mean=0.906,fpr.test.mean=0.0944,fp.test.mean= 117,fn.test.mean=5.19e+03
## [Resample] cross-validation iter 5: acc.test.mean=0.737,tpr.test.mean=0.726,tnr.test.mean=0.898,fpr.test.mean=0.102,fp.test.mean= 126,fn.test.mean=5.12e+03
## [Resample] cross-validation iter 6: acc.test.mean=0.725,tpr.test.mean=0.713,tnr.test.mean=0.901,fpr.test.mean=0.0994,fp.test.mean= 123,fn.test.mean=5.36e+03
## [Resample] cross-validation iter 7: acc.test.mean=0.733,tpr.test.mean=0.722,tnr.test.mean= 0.9,fpr.test.mean= 0.1,fp.test.mean= 124,fn.test.mean=5.2e+03
## [Resample] cross-validation iter 8: acc.test.mean=0.733,tpr.test.mean=0.722,tnr.test.mean=0.893,fpr.test.mean=0.107,fp.test.mean= 132,fn.test.mean=5.2e+03
## [Resample] cross-validation iter 9: acc.test.mean=0.734,tpr.test.mean=0.724,tnr.test.mean=0.891,fpr.test.mean=0.109,fp.test.mean= 135,fn.test.mean=5.16e+03
## [Resample] cross-validation iter 10: acc.test.mean=0.734,tpr.test.mean=0.723,tnr.test.mean=0.896,fpr.test.mean=0.104,fp.test.mean= 129,fn.test.mean=5.19e+03
## [Resample] Aggr. Result: acc.test.mean=0.732,tpr.test.mean=0.721,tnr.test.mean=0.896,fpr.test.mean=0.104,fp.test.mean= 129,fn.test.mean=5.21e+03
## acc.test.mean tpr.test.mean tnr.test.mean fpr.test.mean fp.test.mean
## 0.7322815 0.7214507 0.8959775 0.1040225 128.8000000
## fn.test.mean
## 5212.8000000
fun_cv(train.under)
## [Resample] cross-validation iter 1: acc.test.mean=0.769,tpr.test.mean=0.67,tnr.test.mean=0.918,fpr.test.mean=0.0823,fp.test.mean= 102,fn.test.mean= 617
## [Resample] cross-validation iter 2: acc.test.mean=0.767,tpr.test.mean=0.667,tnr.test.mean=0.918,fpr.test.mean=0.0816,fp.test.mean= 101,fn.test.mean= 623
## [Resample] cross-validation iter 3: acc.test.mean=0.751,tpr.test.mean=0.645,tnr.test.mean=0.912,fpr.test.mean=0.088,fp.test.mean= 109,fn.test.mean= 664
## [Resample] cross-validation iter 4: acc.test.mean=0.77,tpr.test.mean=0.678,tnr.test.mean=0.908,fpr.test.mean=0.092,fp.test.mean= 114,fn.test.mean= 602
## [Resample] cross-validation iter 5: acc.test.mean=0.762,tpr.test.mean=0.659,tnr.test.mean=0.917,fpr.test.mean=0.0832,fp.test.mean= 103,fn.test.mean= 638
## [Resample] cross-validation iter 6: acc.test.mean=0.764,tpr.test.mean=0.671,tnr.test.mean=0.903,fpr.test.mean=0.0969,fp.test.mean= 120,fn.test.mean= 615
## [Resample] cross-validation iter 7: acc.test.mean=0.758,tpr.test.mean=0.657,tnr.test.mean=0.912,fpr.test.mean=0.088,fp.test.mean= 109,fn.test.mean= 642
## [Resample] cross-validation iter 8: acc.test.mean=0.768,tpr.test.mean=0.671,tnr.test.mean=0.914,fpr.test.mean=0.0856,fp.test.mean= 106,fn.test.mean= 616
## [Resample] cross-validation iter 9: acc.test.mean=0.755,tpr.test.mean=0.65,tnr.test.mean=0.915,fpr.test.mean=0.0848,fp.test.mean= 105,fn.test.mean= 656
## [Resample] cross-validation iter 10: acc.test.mean=0.768,tpr.test.mean=0.67,tnr.test.mean=0.915,fpr.test.mean=0.0848,fp.test.mean= 105,fn.test.mean= 617
## [Resample] Aggr. Result: acc.test.mean=0.763,tpr.test.mean=0.664,tnr.test.mean=0.913,fpr.test.mean=0.0867,fp.test.mean= 107,fn.test.mean= 629
## acc.test.mean tpr.test.mean tnr.test.mean fpr.test.mean fp.test.mean
## 0.76318458 0.66388829 0.91326125 0.08673875 107.40000000
## fn.test.mean
## 629.00000000
fun_cv(train.over)
## [Resample] cross-validation iter 1: acc.test.mean=0.783,tpr.test.mean=0.651,tnr.test.mean=0.916,fpr.test.mean=0.0835,fp.test.mean=1.55e+03,fn.test.mean=6.52e+03
## [Resample] cross-validation iter 2: acc.test.mean=0.785,tpr.test.mean=0.654,tnr.test.mean=0.917,fpr.test.mean=0.0826,fp.test.mean=1.54e+03,fn.test.mean=6.47e+03
## [Resample] cross-validation iter 3: acc.test.mean=0.786,tpr.test.mean=0.657,tnr.test.mean=0.917,fpr.test.mean=0.0831,fp.test.mean=1.54e+03,fn.test.mean=6.42e+03
## [Resample] cross-validation iter 4: acc.test.mean=0.785,tpr.test.mean=0.659,tnr.test.mean=0.913,fpr.test.mean=0.0871,fp.test.mean=1.62e+03,fn.test.mean=6.39e+03
## [Resample] cross-validation iter 5: acc.test.mean=0.785,tpr.test.mean=0.658,tnr.test.mean=0.913,fpr.test.mean=0.0871,fp.test.mean=1.62e+03,fn.test.mean=6.4e+03
## [Resample] cross-validation iter 6: acc.test.mean=0.785,tpr.test.mean=0.657,tnr.test.mean=0.914,fpr.test.mean=0.086,fp.test.mean=1.6e+03,fn.test.mean=6.42e+03
## [Resample] cross-validation iter 7: acc.test.mean=0.786,tpr.test.mean=0.658,tnr.test.mean=0.915,fpr.test.mean=0.0847,fp.test.mean=1.57e+03,fn.test.mean=6.4e+03
## [Resample] cross-validation iter 8: acc.test.mean=0.786,tpr.test.mean=0.656,tnr.test.mean=0.916,fpr.test.mean=0.084,fp.test.mean=1.56e+03,fn.test.mean=6.43e+03
## [Resample] cross-validation iter 9: acc.test.mean=0.786,tpr.test.mean=0.657,tnr.test.mean=0.915,fpr.test.mean=0.0845,fp.test.mean=1.57e+03,fn.test.mean=6.41e+03
## [Resample] cross-validation iter 10: acc.test.mean=0.786,tpr.test.mean=0.658,tnr.test.mean=0.915,fpr.test.mean=0.0849,fp.test.mean=1.58e+03,fn.test.mean=6.4e+03
## [Resample] Aggr. Result: acc.test.mean=0.785,tpr.test.mean=0.657,tnr.test.mean=0.915,fpr.test.mean=0.0848,fp.test.mean=1.57e+03,fn.test.mean=6.43e+03
## acc.test.mean tpr.test.mean tnr.test.mean fpr.test.mean fp.test.mean
## 7.854352e-01 6.566119e-01 9.152372e-01 8.476283e-02 1.574300e+03
## fn.test.mean
## 6.426200e+03
#fun_cv(train.smote) (dont have )
#train and predict
#caret masks mlr::train, so detach it before calling mlr's train();
#guard the detach — a bare detach() errors if caret is not currently attached
if ("package:caret" %in% search()) detach("package:caret", unload = TRUE)
nB_model <- train(naive_learner,train.over)
nB_predict <- predict(nB_model,test.task)
#evaluate
#extract the hard class labels from the mlr prediction object
nB_prediction <- nB_predict$data$response
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mlr':
##
## train
#caret::confusionMatrix expects (data = predictions, reference = truth);
#the original call passed the truth first, which swaps the meaning of
#sensitivity/specificity and PPV/NPV in the report.
#NOTE(review): the printed output below was produced by the old argument
#order — re-run to refresh it.
dCM <- confusionMatrix(nB_prediction, d_test$income_level)
dCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 61523 32053
## 1 511 5675
##
## Accuracy : 0.6736
## 95% CI : (0.6707, 0.6765)
## No Information Rate : 0.6218
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.17
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9918
## Specificity : 0.1504
## Pos Pred Value : 0.6575
## Neg Pred Value : 0.9174
## Prevalence : 0.6218
## Detection Rate : 0.6167
## Detection Prevalence : 0.9380
## Balanced Accuracy : 0.5711
##
## 'Positive' Class : 0
##
#calculate F measure
#NOTE(review): because the confusion matrix was built with the truth as the
#first argument, these byClass metrics refer to the positive class "0"
#(the majority class), not the minority class — confirm before quoting them
precision <- dCM$byClass['Pos Pred Value']
recall <- dCM$byClass['Sensitivity']
Specificity <- dCM$byClass['Specificity']
Specificity
## Specificity
## 0.1504188
#F1 score: harmonic mean of precision and recall
f_measure <- 2*((precision*recall)/(precision+recall))
f_measure
## Pos Pred Value
## 0.7907332
Naive Bayes performs very poorly on the minority class — per the confusion matrix above, only about 15% (specificity = 0.1504) of minority observations are predicted correctly. Let’s try the xgboost algorithm.
#xgboost
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.4.1
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
##
## slice
set.seed(2002)
xgb_learner <- makeLearner("classif.xgboost",predict.type = "response")
#fixed: 'print.every.n' is deprecated (the repeated warnings in the output
#below confirm this); the current xgboost parameter name is 'print_every_n'
xgb_learner$par.vals <- list(
objective = "binary:logistic",
eval_metric = "error",
nrounds = 150,
print_every_n = 50
)
#define hyperparameters for tuning
xg_ps <- makeParamSet(
#tree depth: deeper trees capture more interactions but overfit faster
makeIntegerParam("max_depth",lower=3,upper=10),
#L2 regularization strength on leaf weights
makeNumericParam("lambda",lower=0.05,upper=0.5),
#learning rate (shrinkage per boosting round)
makeNumericParam("eta", lower = 0.01, upper = 0.5),
#fraction of rows sampled per tree
makeNumericParam("subsample", lower = 0.50, upper = 1),
makeNumericParam("min_child_weight",lower=2,upper=10),
#fraction of columns sampled per tree
makeNumericParam("colsample_bytree",lower = 0.50,upper = 0.80)
)
#define search function
rancontrol <- makeTuneControlRandom(maxit = 5L) #do 5 iterations
#5 fold cross validation
set_cv <- makeResampleDesc("CV",iters = 5L,stratify = TRUE)
#tune parameters
#xgboost needs numeric input, so one-hot encode the categorical features
#NOTE(review): dummy-encoding train and test separately assumes both share
#identical factor levels — confirm, or encode before splitting
train.task <- createDummyFeatures(train.task)
test.task <- createDummyFeatures(test.task)
#random search over the parameter space, scored by 5-fold stratified CV
xgb_tune <- tuneParams(learner = xgb_learner, task = train.task, resampling = set_cv, measures = list(acc,tpr,tnr,fpr,fp,fn), par.set = xg_ps, control = rancontrol)
## [Tune] Started tuning learner classif.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## max_depth integer - - 3 to 10 - TRUE -
## lambda numeric - - 0.05 to 0.5 - TRUE -
## eta numeric - - 0.01 to 0.5 - TRUE -
## subsample numeric - - 0.5 to 1 - TRUE -
## min_child_weight numeric - - 2 to 10 - TRUE -
## colsample_bytree numeric - - 0.5 to 0.8 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0Imputation value: -0Imputation value: -0Imputation value: 1Imputation value: InfImputation value: Inf
## [Tune-x] 1: max_depth=5; lambda=0.146; eta=0.374; subsample=0.841; min_child_weight=8.23; colsample_bytree=0.587
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.059191
## [51] train-error:0.049957
## [101] train-error:0.048528
## [150] train-error:0.047845
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.055996
## [51] train-error:0.049919
## [101] train-error:0.048297
## [150] train-error:0.047470
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057662
## [51] train-error:0.050126
## [101] train-error:0.048628
## [150] train-error:0.047732
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.058089
## [51] train-error:0.049844
## [101] train-error:0.048096
## [150] train-error:0.047325
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.058076
## [51] train-error:0.049631
## [101] train-error:0.048039
## [150] train-error:0.047200
## [Tune-y] 1: acc.test.mean=0.948,tpr.test.mean=0.987,tnr.test.mean=0.354,fpr.test.mean=0.646,fp.test.mean=1.6e+03,fn.test.mean= 484; time: 5.3 min
## [Tune-x] 2: max_depth=3; lambda=0.243; eta=0.233; subsample=0.936; min_child_weight=2.92; colsample_bytree=0.793
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.061998
## [51] train-error:0.052331
## [101] train-error:0.051554
## [150] train-error:0.050934
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.061829
## [51] train-error:0.051962
## [101] train-error:0.051366
## [150] train-error:0.050884
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.062060
## [51] train-error:0.052162
## [101] train-error:0.051473
## [150] train-error:0.051040
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.062054
## [51] train-error:0.052507
## [101] train-error:0.051542
## [150] train-error:0.051047
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.062060
## [51] train-error:0.051742
## [101] train-error:0.051354
## [150] train-error:0.050771
## [Tune-y] 2: acc.test.mean=0.948,tpr.test.mean=0.989,tnr.test.mean=0.332,fpr.test.mean=0.668,fp.test.mean=1.65e+03,fn.test.mean= 410; time: 2.9 min
## [Tune-x] 3: max_depth=7; lambda=0.259; eta=0.264; subsample=0.66; min_child_weight=7.25; colsample_bytree=0.714
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.054267
## [51] train-error:0.048666
## [101] train-error:0.047081
## [150] train-error:0.045302
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.053941
## [51] train-error:0.048817
## [101] train-error:0.046580
## [150] train-error:0.045126
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.055438
## [51] train-error:0.049042
## [101] train-error:0.046899
## [150] train-error:0.045496
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.055589
## [51] train-error:0.048666
## [101] train-error:0.047119
## [150] train-error:0.045396
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.055971
## [51] train-error:0.048503
## [101] train-error:0.046323
## [150] train-error:0.045264
## [Tune-y] 3: acc.test.mean=0.947,tpr.test.mean=0.986,tnr.test.mean=0.356,fpr.test.mean=0.644,fp.test.mean=1.6e+03,fn.test.mean= 508; time: 7.2 min
## [Tune-x] 4: max_depth=5; lambda=0.171; eta=0.295; subsample=0.855; min_child_weight=5.54; colsample_bytree=0.735
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.056153
## [51] train-error:0.050276
## [101] train-error:0.048372
## [150] train-error:0.047695
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.056911
## [51] train-error:0.050232
## [101] train-error:0.048773
## [150] train-error:0.047532
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057600
## [51] train-error:0.050207
## [101] train-error:0.048929
## [150] train-error:0.048190
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057707
## [51] train-error:0.050326
## [101] train-error:0.048810
## [150] train-error:0.047520
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.059203
## [51] train-error:0.050119
## [101] train-error:0.048628
## [150] train-error:0.047638
## [Tune-y] 4: acc.test.mean=0.948,tpr.test.mean=0.988,tnr.test.mean=0.351,fpr.test.mean=0.649,fp.test.mean=1.61e+03,fn.test.mean= 463; time: 4.4 min
## [Tune-x] 5: max_depth=5; lambda=0.279; eta=0.28; subsample=0.63; min_child_weight=7.08; colsample_bytree=0.572
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057587
## [51] train-error:0.050345
## [101] train-error:0.049249
## [150] train-error:0.048660
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057161
## [51] train-error:0.050640
## [101] train-error:0.048967
## [150] train-error:0.048278
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.058063
## [51] train-error:0.050539
## [101] train-error:0.049342
## [150] train-error:0.048534
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057663
## [51] train-error:0.050395
## [101] train-error:0.049055
## [150] train-error:0.048221
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.057950
## [51] train-error:0.050301
## [101] train-error:0.049117
## [150] train-error:0.048271
## [Tune-y] 5: acc.test.mean=0.948,tpr.test.mean=0.988,tnr.test.mean=0.352,fpr.test.mean=0.648,fp.test.mean=1.6e+03,fn.test.mean= 467; time: 4.8 min
## [Tune] Result: max_depth=3; lambda=0.243; eta=0.233; subsample=0.936; min_child_weight=2.92; colsample_bytree=0.793 : acc.test.mean=0.948,tpr.test.mean=0.989,tnr.test.mean=0.332,fpr.test.mean=0.668,fp.test.mean=1.65e+03,fn.test.mean= 410
#Now, we can use these parameter for modeling using xgb_tune$x which contains the best tuned parameters.
#set optimal parameters
xgb_new <- setHyperPars(learner = xgb_learner, par.vals = xgb_tune$x)
#train model
#guard the detach: a bare detach() errors if caret is not currently attached
if ("package:caret" %in% search()) detach("package:caret", unload = TRUE)
xgmodel <- train(xgb_new, train.task)
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.062058
## [51] train-error:0.052124
## [101] train-error:0.051658
## [150] train-error:0.051037
#test model
predict.xg <- predict(xgmodel, test.task)
#make prediction
#extract the hard class labels from the mlr prediction object
xg_prediction <- predict.xg$data$response
#make confusion matrix
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mlr':
##
## train
#caret::confusionMatrix expects (data = predictions, reference = truth);
#the original call passed the truth first — arguments corrected here
xg_confused <- confusionMatrix(xg_prediction, d_test$income_level)
precision <- xg_confused$byClass['Pos Pred Value']
recall <- xg_confused$byClass['Sensitivity']
#fixed copy-paste bug: Specificity was read from dCM (the naive Bayes
#confusion matrix) instead of xg_confused, so the printed value below
#(0.1504188) is stale — it belongs to the naive Bayes model; re-run to refresh
Specificity <- xg_confused$byClass['Specificity']
Specificity
## Specificity
## 0.1504188
#F1 score: harmonic mean of precision and recall
f_measure <- 2*((precision*recall)/(precision+recall))
f_measure
## Pos Pred Value
## 0.9727197
We can see that xgboost is able to predict the minority class with about 65% accuracy; it has outperformed the naive Bayes model.
Until now, our model has been making hard label predictions, which effectively use a 0.5 cutoff. Note that with predict.type = "response" the prediction object does not store an explicit threshold (it prints NA below):
predict.xg$threshold
## [1] NA
Due to imbalanced nature of the data, the threshold of 0.5 will always favor the majority class since the probability of a class 1 is quite low. Now, we’ll try a new technique:
Instead of labels, we’ll predict probabilities Plot and study the AUC curve Adjust the threshold for better prediction We’ll continue using xgboost ,To do this, we need to change the predict.type parameter while defining learner.
#xgboost AUC
#switch the tuned learner to probability output (needed for ROC / threshold tuning)
xgb_prob <- setPredictType(learner = xgb_new,predict.type = "prob")
#train model
#guard the detach: a bare detach() errors if caret is not currently attached
if ("package:caret" %in% search()) detach("package:caret", unload = TRUE)
xgmodel_prob <- train(xgb_prob,train.task)
## Warning: 'print.every.n' is deprecated.
## Use 'print_every_n' instead.
## See help("Deprecated") and help("xgboost-deprecated").
## [1] train-error:0.062058
## [51] train-error:0.052230
## [101] train-error:0.051618
## [150] train-error:0.051017
#predict
#class probabilities for every test observation (prob.0 / prob.1 columns)
predict.xgprob <- predict(xgmodel_prob,test.task)
Now, let’s look at the probability table thus created:
predicted probabilities
#inspect the first 10 rows of predicted probabilities
predict.xgprob$data[1:10,]
## id truth prob.0 prob.1 response
## 1 1 0 0.9944599 5.540112e-03 0
## 2 2 0 0.7511105 2.488895e-01 0
## 3 3 0 0.9999678 3.223390e-05 0
## 4 4 0 0.9545683 4.543170e-02 0
## 5 5 0 0.9536002 4.639978e-02 0
## 6 6 0 0.9999555 4.451033e-05 0
## 7 7 0 0.9999761 2.392266e-05 0
## 8 8 0 0.9966745 3.325516e-03 0
## 9 9 0 0.8369040 1.630960e-01 0
## 10 10 0 0.9999709 2.905068e-05 0
Since, we have obtained the class probabilities, let’s create an AUC curve and determine the basis to modify prediction threshold.
#threshold-vs-performance data for the ROC curve (FPR on x, TPR on y)
df <- generateThreshVsPerfData(predict.xgprob,measures = list(fpr,tpr))
plotROCCurves(df)
AUC is a measure of true positive rate and false positive rate. We aim to reach as close to top left corner as possible. Therefore, we should aim to reduce the threshold so that the false positive rate can be reduced.
#set threshold as 0.4
#lower the positive-class probability cutoff from the default 0.5 to 0.4
pred2 <- setThreshold(predict.xgprob,0.4)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mlr':
##
## train
confusionMatrix(d_test$income_level,pred2$data$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 92989 587
## 1 4644 1542
##
## Accuracy : 0.9476
## 95% CI : (0.9462, 0.9489)
## No Information Rate : 0.9787
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3503
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9524
## Specificity : 0.7243
## Pos Pred Value : 0.9937
## Neg Pred Value : 0.2493
## Prevalence : 0.9787
## Detection Rate : 0.9321
## Detection Prevalence : 0.9380
## Balanced Accuracy : 0.8384
##
## 'Positive' Class : 0
##
We can see we are able to predict the minority class with 72.43% accuracy (specificity = 0.7243). Setting the threshold using the AUC curve actually improves our model performance. Let’s further change it to 0.30.
#lower the probability cutoff further to 0.30
pred3 <- setThreshold(predict.xgprob,0.30)
#caret::confusionMatrix expects (data = predictions, reference = truth);
#arguments corrected — the printed matrix below came from the old (swapped)
#call, so re-run to refresh it
confusionMatrix(pred3$data$response, d_test$income_level)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93312 264
## 1 5227 959
##
## Accuracy : 0.945
## 95% CI : (0.9435, 0.9464)
## No Information Rate : 0.9877
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2434
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9470
## Specificity : 0.7841
## Pos Pred Value : 0.9972
## Neg Pred Value : 0.1550
## Prevalence : 0.9877
## Detection Rate : 0.9353
## Detection Prevalence : 0.9380
## Balanced Accuracy : 0.8655
##
## 'Positive' Class : 0
##
This model has outperformed all our previous models, with 78.41% of the minority class predicted correctly.